{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da11e5a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import warnings\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.base import BaseEstimator\n",
    "warnings.simplefilter('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f815a26a",
   "metadata": {},
   "source": [
    "To load the hdmpy package which is used in the code below run the method below run from a jupyter notebook cell the code below\n",
    "```\n",
    "!pip install multiprocess\n",
    "!pip install pyreadr\n",
    "!git clone https://github.com/maxhuppertz/hdmpy.git\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d465905f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.insert(1, \"./hdmpy\")\n",
    "import hdmpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5d475d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# An estimator class that runs the lasso with theoretically driven penalty choice.\n",
    "# Better in small samples than cross-validation and also faster computationally\n",
    "class RLasso(BaseEstimator):\n",
    "    \n",
    "    def __init__(self, *, post=False):\n",
    "        self.post = post\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        self.rlasso_ = hdmpy.rlasso(X, y, post=self.post)\n",
    "        return self\n",
    "\n",
    "    @property\n",
    "    def coef_(self):\n",
    "        return np.array(self.rlasso_.est['beta']).flatten()\n",
    "\n",
    "    def predict(self, X):\n",
    "        return X @ self.coef_ + np.array(self.rlasso_.est['intercept'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8c3a418",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A simple experimental data generating process. No effect heterogeneity.\n",
    "def gen_data(n, d, p, delta, base):\n",
    "    X = np.random.normal(0, 1, size=(n, d))\n",
    "    D = np.random.binomial(1, p, size=(n,))\n",
    "    y0 = base - X[:, 0] + np.random.normal(0, .1, size=(n,))\n",
    "    y1 = delta + base - X[:, 0] + np.random.normal(0, .1, size=(n,))\n",
    "    y = y1 * D + y0 * (1 - D)\n",
    "    return y, D, X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "258d88f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "n = 100 # n samples\n",
    "d = 100 # n features\n",
    "delta = 1.0 # treatment effect\n",
    "base = .3 # baseline outcome"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69d66c15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simple two means estimate and calcualtion of variance\n",
    "def twomeans(y, D):\n",
    "    hat0 = np.mean(y[D==0]) # mean of outcome of un-treated\n",
    "    hat1 = np.mean(y[D==1]) # mean of outcome of treated\n",
    "    V0 = np.var(y[D==0]) / np.mean(1 - D) # asymptotic variance of the mean of outcome of untreated\n",
    "    V1 = np.var(y[D==1]) / np.mean(D) # asymptotic variance of the mean of outcome of treated\n",
    "    hat = hat1 - hat0 # estimate of effect\n",
    "    stderr = np.sqrt((V0 + V1) / n) # standard error of estimate of effect\n",
    "    return hat, stderr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db73000b",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(123)\n",
    "y, D, X = gen_data(n, d, .2, delta, base) # generate RCT data\n",
    "twomeans(y, D) # calculate estimation quantities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f4535a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "# We implement the partialling out version of OLS (for pedagogical purposes)\n",
    "def partialling_out(y, D, W):\n",
    "    yres = y - LinearRegression().fit(W, y).predict(W) # residualize outcome with OLS\n",
    "    Dres = D - LinearRegression().fit(W, D).predict(W) # residualize treatment with OLS\n",
    "    hat = np.mean(yres * Dres) / np.mean(Dres**2) # calculate final residual ~ residual ols estimate\n",
    "    epsilon = yres - hat * Dres # calculate residual of final regression; epsilon in the BLP decomposition\n",
    "    V = np.mean(epsilon**2 * Dres**2) / np.mean(Dres**2)**2 # calculate variance of effect\n",
    "    return hat, np.sqrt(V / y.shape[0]) # return estimate and standard error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f718a18",
   "metadata": {},
   "outputs": [],
   "source": [
    "partialling_out(y, D, np.hstack([D*X, X]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84e660b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now we simply replace OLS with Lasso to implement the Double Lasso process\n",
    "def double_lasso(y, D, W):\n",
    "    yres = y - RLasso().fit(W, y).predict(W) # residualize outcome with Lasso\n",
    "    Dres = D - RLasso().fit(W, D).predict(W) # residualize treatment with Lasso\n",
    "    # rest is the same as in the OLS case\n",
    "    hat = np.mean(yres * Dres) / np.mean(Dres**2)\n",
    "    epsilon = yres - hat * Dres\n",
    "    V = np.mean(epsilon**2 * Dres**2) / np.mean(Dres**2)**2\n",
    "    return hat, np.sqrt(V / y.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8b1cfd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "double_lasso(y, D, np.hstack([D*X, X]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e58c0e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# We now check the distributional properties of the different estimators across experiments\n",
    "# First is the simple two means estimate\n",
    "cov, hats, stderrs = [], [], [] # we will keep track of coverage (truth is in CI) and of the point estimate and stderr\n",
    "for _ in range(100):\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)\n",
    "    hat, stderr = twomeans(y, D)\n",
    "    ci = [hat - 1.96 * stderr, hat + 1.96 * stderr] # 95% confidence interval\n",
    "    hats += [hat]\n",
    "    stderrs += [stderr]\n",
    "    cov += [(ci[0] <= delta) & (delta <= ci[1])] # 1 if CI contains the true parameter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67f2a63d",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(cov) # average coverage (should be .95 ideally)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31e49a00",
   "metadata": {},
   "outputs": [],
   "source": [
    "# distribution of estimates\n",
    "plt.hist(hats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81766f99",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(hats) # mean of estimate; measures how biased the estimate is (should be =delta ideally)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "174f80f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(hats) # standard deviation of estimates; should be close to the standard errors we calculated for the CIs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2789f752",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(stderrs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3a5c66c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's repeat this for the partialling out process (OLS), controlling for X\n",
    "cov, hats, stderrs = [], [], [] # we will keep track of coverage (truth is in CI) and of the point estimate and stderr\n",
    "for _ in range(100):\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)\n",
    "    hat, stderr = partialling_out(y, D, X)\n",
    "    ci = [hat - 1.96 * stderr, hat + 1.96 * stderr] # 95% confidence interval\n",
    "    hats += [hat]\n",
    "    stderrs += [stderr]\n",
    "    cov += [(ci[0] <= delta) & (delta <= ci[1])] # 1 if CI contains the true parameter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "535ff01b",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(cov)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1a42254",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.hist(hats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c1cfa87",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(hats) # ols is heavily biased... mean of estimates very far from delta=1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "442740a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(hats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc4d40e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(stderrs) # standard error severely under estimates the variance of the estimate; all this is due to overfitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e0a09f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now let's try the double Lasso. Because it's computationally expensive\n",
    "# we'll do the experiments in parallel. Python makes parallelism very simple\n",
    "from joblib import Parallel, delayed # we import these two functions\n",
    "\n",
    "# we wrap our experiment process in a function, which is supposed to run a\n",
    "# a single experiment\n",
    "def exp(it, n, d):\n",
    "    np.random.seed(it) # we draw a different seed for each experiment\n",
    "    y, D, X = gen_data(n, d, .2, delta, base) # we generate data\n",
    "    hat, stderr = double_lasso(y, D, X) # we apply the double lasso process\n",
    "    ci = [hat - 1.96 * stderr, hat + 1.96 * stderr]\n",
    "    # return estimate, standard error and (1 if CI contains the true parameter)\n",
    "    return hat, stderr, (ci[0] <= delta) & (delta <= ci[1])\n",
    "\n",
    "# Now here is how you run any function in parallel multiple times\n",
    "# It's a simple parallel for loop.\n",
    "res = Parallel(n_jobs=-1, verbose=3)(delayed(exp)(it, n, d) for it in range(100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45cdfbba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This simply takes the list of triples and turns it into a triple of lists :)\n",
    "# good trick to know\n",
    "hats, stderrs, cov = zip(*res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48544788",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(cov) # much better coverage than OLS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58334e8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.hist(hats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fc08ca4",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(hats) # much closer to 1... (almost the same as two-means)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2194a541",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(hats) # standard deviation much smaller than two means, which did not adjust for X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be61450d",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(stderrs) # and close to the calculate standard errors; we correctly estimated uncertainty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "394859c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's see what would happen if we just run a single lasso\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "def exp(it, n, d):\n",
    "    np.random.seed(it)\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)\n",
    "    hat = RLasso().fit(np.hstack([D.reshape(-1, 1), X]), y).coef_[0]\n",
    "    return hat # no obvious way to account for uncertainty\n",
    "\n",
    "res = Parallel(n_jobs=-1, verbose=3)(delayed(exp)(it, n, d) for it in range(100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "893d95ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.hist(res)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1711931",
   "metadata": {},
   "outputs": [],
   "source": [
    "# bias is comparable and larger than standard deviation.\n",
    "# Even if we could estimate the standard deviation, confidence intervals would undercover\n",
    "1 - np.mean(res), np.std(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3fb42054",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's try adding a post-Lasso OLS step and construct confidence\n",
    "# intervals ignoring the Lasso step\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "def exp(it, n, d):\n",
    "    np.random.seed(it)\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)\n",
    "    # run a big lasso y ~ D, X\n",
    "    DX = np.hstack([D.reshape(-1, 1), X])\n",
    "    coefs = RLasso().fit(DX, y).coef_[1:]\n",
    "    # run OLS on y ~ D, X[chosen by lasso]\n",
    "    # calculate standard error as if lasso step never happened\n",
    "    hat, stderr = partialling_out(y, D - np.mean(D), X[:, np.abs(coefs)>0.0])\n",
    "    ci = [hat - 1.96 * stderr, hat + 1.96 * stderr]\n",
    "    return hat, stderr, (ci[0] <= delta) & (delta <= ci[1])\n",
    "\n",
    "res = Parallel(n_jobs=-1, verbose=3)(delayed(exp)(it, n, d) for it in range(100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25303e2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "hats, stderrs, cov = zip(*res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68b51670",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(cov) # not bad"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f538442b",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.hist(hats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c71cd7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "1 - np.mean(hats), np.std(hats) # quite un-biased; bias < standard deviation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b74cf552",
   "metadata": {},
   "outputs": [],
   "source": [
    "# we under-estimated a bit the uncertainty; smaller estimated stderr than true std. \n",
    "# this is most prob a finite sample error, from ignoring the lasso variable selection step\n",
    "# this is an RCT and so even post lasso ols is Neyman orthogonal. We should expect good behavior.\n",
    "np.mean(stderrs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0eeedd3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# But what if we are not in an RCT!?\n",
    "import scipy.special\n",
    "def gen_data(n, d, p, delta, base):\n",
    "    X = np.random.normal(0, 1, size=(n, d))\n",
    "    D = X[:, 0] + np.random.normal(0, 1/4, size=(n,))\n",
    "    y = delta * D + base - X[:, 0] + np.random.normal(0, 1, size=(n,))\n",
    "    return y, D, X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd843f2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from joblib import Parallel, delayed\n",
    "\n",
    "def exp(it, n, d):\n",
    "    np.random.seed(it)\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)\n",
    "    DX = np.hstack([D.reshape(-1, 1), X])\n",
    "    coefs = RLasso().fit(DX, y).coef_[1:]\n",
    "    hat, stderr = partialling_out(y, D, X[:, np.abs(coefs)>0.0])\n",
    "    ci = [hat - 1.96 * stderr, hat + 1.96 * stderr]\n",
    "    return hat, stderr, (ci[0] <= delta) & (delta <= ci[1])\n",
    "\n",
    "res = Parallel(n_jobs=-1, verbose=3)(delayed(exp)(it, n, d) for it in range(100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26f93c1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "hats, stderrs, cov = zip(*res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b30e1d54",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(cov) # Oops! Post Lasso OLS severely undercovers; It is not Neyman orthogonal when D is correlated with X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea9e640f",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.hist(hats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a725e335",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(hats) # very heavily biased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d52c197",
   "metadata": {},
   "outputs": [],
   "source": [
    "# But let's try the Neyman orthogonal Double Lasso\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "def exp(it, n, d):\n",
    "    np.random.seed(it)\n",
    "    y, D, X = gen_data(n, d, .2, delta, base)\n",
    "    hat, stderr = double_lasso(y, D, X) # we apply the double lasso process\n",
    "    ci = [hat - 1.96 * stderr, hat + 1.96 * stderr]\n",
    "    return hat, stderr, (ci[0] <= delta) & (delta <= ci[1])\n",
    "\n",
    "res = Parallel(n_jobs=-1, verbose=3)(delayed(exp)(it, n, d) for it in range(100))\n",
    "hats, stderrs, cov = zip(*res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c289eb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(cov) # great coverage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "add24767",
   "metadata": {},
   "outputs": [],
   "source": [
    "1 - np.mean(hats), np.std(hats) # very small bias compared to standard deviation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a89f4c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(stderrs) # accurate estimation of uncertainty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d33463a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Approximately normal distribution of estimates, centered at the truth\n",
    "plt.hist(hats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3ae78ae",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
